{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Description:\n",
    "这是实验的数据预处理模块，此次实验使用的亚马逊产品数据集里面的Electronics子集， 具体详情描述可以参考：[http://jmcauley.ucsd.edu/data/amazon/](http://jmcauley.ucsd.edu/data/amazon/)。 这里用的2014年的那两个per-category dataset。大体思路分为两个部分：\n",
    "1. 把原始的json数据转成pd的形式， 从meta数据集中只保留在reviews文件中出现过的商品\n",
    "2. 把pd数据转成pkl数据， 后面用这个生成数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:57:02.291119Z",
     "start_time": "2021-02-18T01:57:01.862868Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import gc\n",
    "import random\n",
    "from tqdm import tqdm\n",
    "\n",
    "random.seed(2020)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:57:13.055968Z",
     "start_time": "2021-02-18T01:57:13.045953Z"
    }
   },
   "outputs": [],
   "source": [
    "def to_df(file_path):\n",
    "    \"\"\"\n",
    "        转换为DataFrame结构\n",
    "        file_path: 文件路径\n",
    "        return: DtaFrame\n",
    "    \"\"\"\n",
    "    with open(file_path, 'r') as fin:\n",
    "        df = {}\n",
    "        i = 0\n",
    "        for line in tqdm(fin):\n",
    "            #print(line)\n",
    "            df[i] = eval(line)   #   直接针对字符串运行\n",
    "            i += 1\n",
    "            \n",
    "            if i > 1000000:   # 笔记本内存不够了， 先提取少量一部分, 如果电脑允许，这里可以去掉\n",
    "                break\n",
    "        df = pd.DataFrame.from_dict(df, orient='index')\n",
    "        return df            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:57:55.548709Z",
     "start_time": "2021-02-18T01:57:14.270727Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "1000000it [00:37, 26838.86it/s]\n"
     ]
    }
   ],
   "source": [
    "# 处理review\n",
    "reviews_df = to_df('./raw_data/reviews_Electronics.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:57:55.752161Z",
     "start_time": "2021-02-18T01:57:55.722791Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>helpful</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>overall</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>reviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AKM1MP6P0OYPR</td>\n",
       "      <td>0132793040</td>\n",
       "      <td>Vicki Gibson \"momo4\"</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>Corey Barker does a great job of explaining Bl...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Very thorough</td>\n",
       "      <td>1365811200</td>\n",
       "      <td>04 13, 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2CX7LUOHB2NDG</td>\n",
       "      <td>0321732944</td>\n",
       "      <td>Bernie</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>While many beginner DVDs try to teach you ever...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Adobe Photoshop CS5 Crash Course with master P...</td>\n",
       "      <td>1341100800</td>\n",
       "      <td>07 1, 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2NWSAGRHCP8N5</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>bowmans2007</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>It never worked. My daughter worked to earn th...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>absolutely horrible</td>\n",
       "      <td>1367193600</td>\n",
       "      <td>04 29, 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A2WNBOD3WNDNKT</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>JAL</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>Some of the functions did not work properly.  ...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>Disappointing</td>\n",
       "      <td>1374451200</td>\n",
       "      <td>07 22, 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A1GI0U4ZRJA8WN</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>Truthfull</td>\n",
       "      <td>[4, 4]</td>\n",
       "      <td>Do not waste your money on this thing it is te...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>TERRIBLE DONT WASTE YOUR MONEY</td>\n",
       "      <td>1334707200</td>\n",
       "      <td>04 18, 2012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       reviewerID        asin          reviewerName helpful  \\\n",
       "0   AKM1MP6P0OYPR  0132793040  Vicki Gibson \"momo4\"  [1, 1]   \n",
       "1  A2CX7LUOHB2NDG  0321732944                Bernie  [0, 0]   \n",
       "2  A2NWSAGRHCP8N5  0439886341           bowmans2007  [1, 1]   \n",
       "3  A2WNBOD3WNDNKT  0439886341                   JAL  [1, 1]   \n",
       "4  A1GI0U4ZRJA8WN  0439886341             Truthfull  [4, 4]   \n",
       "\n",
       "                                          reviewText  overall  \\\n",
       "0  Corey Barker does a great job of explaining Bl...      5.0   \n",
       "1  While many beginner DVDs try to teach you ever...      5.0   \n",
       "2  It never worked. My daughter worked to earn th...      1.0   \n",
       "3  Some of the functions did not work properly.  ...      3.0   \n",
       "4  Do not waste your money on this thing it is te...      1.0   \n",
       "\n",
       "                                             summary  unixReviewTime  \\\n",
       "0                                      Very thorough      1365811200   \n",
       "1  Adobe Photoshop CS5 Crash Course with master P...      1341100800   \n",
       "2                                absolutely horrible      1367193600   \n",
       "3                                      Disappointing      1374451200   \n",
       "4                     TERRIBLE DONT WASTE YOUR MONEY      1334707200   \n",
       "\n",
       "    reviewTime  \n",
       "0  04 13, 2013  \n",
       "1   07 1, 2012  \n",
       "2  04 29, 2013  \n",
       "3  07 22, 2013  \n",
       "4  04 18, 2012  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reviews_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:58:00.560848Z",
     "start_time": "2021-02-18T01:57:58.383673Z"
    }
   },
   "outputs": [],
   "source": [
    "with open('./raw_data/reviews.pkl', 'wb') as f:\n",
    "    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:58:02.263670Z",
     "start_time": "2021-02-18T01:58:02.183508Z"
    }
   },
   "outputs": [],
   "source": [
    "unique_asin = reviews_df['asin'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:58:03.463126Z",
     "start_time": "2021-02-18T01:58:03.302514Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "309"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del reviews_df\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:58:50.115627Z",
     "start_time": "2021-02-18T01:58:04.974042Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "498196it [00:42, 11777.44it/s]\n"
     ]
    }
   ],
   "source": [
    "# 处理meta_Electroics  从meta数据集中只保留在reviews文件中出现过的商品\n",
    "meta_df = to_df('./raw_data/meta_Electronics.json')\n",
    "meta_df = meta_df[meta_df['asin'].isin(unique_asin)]\n",
    "meta_df = meta_df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:54:40.996730Z",
     "start_time": "2021-02-18T01:54:40.975786Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>imUrl</th>\n",
       "      <th>description</th>\n",
       "      <th>categories</th>\n",
       "      <th>title</th>\n",
       "      <th>price</th>\n",
       "      <th>salesRank</th>\n",
       "      <th>related</th>\n",
       "      <th>brand</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0132793040</td>\n",
       "      <td>http://ecx.images-amazon.com/images/I/31JIPhp%...</td>\n",
       "      <td>The Kelby Training DVD Mastering Blend Modes i...</td>\n",
       "      <td>[[Electronics, Computers &amp; Accessories, Cables...</td>\n",
       "      <td>Kelby Training DVD: Mastering Blend Modes in A...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0321732944</td>\n",
       "      <td>http://ecx.images-amazon.com/images/I/31uogm6Y...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[[Electronics, Computers &amp; Accessories, Cables...</td>\n",
       "      <td>Kelby Training DVD: Adobe Photoshop CS5 Crash ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0439886341</td>\n",
       "      <td>http://ecx.images-amazon.com/images/I/51k0qa8f...</td>\n",
       "      <td>Digital Organizer and Messenger</td>\n",
       "      <td>[[Electronics, Computers &amp; Accessories, PDAs, ...</td>\n",
       "      <td>Digital Organizer and Messenger</td>\n",
       "      <td>8.15</td>\n",
       "      <td>{'Electronics': 144944}</td>\n",
       "      <td>{'also_viewed': ['0545016266', 'B009ECM8QY', '...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0511189877</td>\n",
       "      <td>http://ecx.images-amazon.com/images/I/41HaAhbv...</td>\n",
       "      <td>The CLIKR-5 UR5U-8780L remote control is desig...</td>\n",
       "      <td>[[Electronics, Accessories &amp; Supplies, Audio &amp;...</td>\n",
       "      <td>CLIKR-5 Time Warner Cable Remote Control UR5U-...</td>\n",
       "      <td>23.36</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0528881469</td>\n",
       "      <td>http://ecx.images-amazon.com/images/I/51FnRkJq...</td>\n",
       "      <td>Like its award-winning predecessor, the Intell...</td>\n",
       "      <td>[[Electronics, GPS &amp; Navigation, Vehicle GPS, ...</td>\n",
       "      <td>Rand McNally 528881469 7-inch Intelliroute TND...</td>\n",
       "      <td>299.99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin                                              imUrl  \\\n",
       "0  0132793040  http://ecx.images-amazon.com/images/I/31JIPhp%...   \n",
       "1  0321732944  http://ecx.images-amazon.com/images/I/31uogm6Y...   \n",
       "2  0439886341  http://ecx.images-amazon.com/images/I/51k0qa8f...   \n",
       "3  0511189877  http://ecx.images-amazon.com/images/I/41HaAhbv...   \n",
       "4  0528881469  http://ecx.images-amazon.com/images/I/51FnRkJq...   \n",
       "\n",
       "                                         description  \\\n",
       "0  The Kelby Training DVD Mastering Blend Modes i...   \n",
       "1                                                NaN   \n",
       "2                    Digital Organizer and Messenger   \n",
       "3  The CLIKR-5 UR5U-8780L remote control is desig...   \n",
       "4  Like its award-winning predecessor, the Intell...   \n",
       "\n",
       "                                          categories  \\\n",
       "0  [[Electronics, Computers & Accessories, Cables...   \n",
       "1  [[Electronics, Computers & Accessories, Cables...   \n",
       "2  [[Electronics, Computers & Accessories, PDAs, ...   \n",
       "3  [[Electronics, Accessories & Supplies, Audio &...   \n",
       "4  [[Electronics, GPS & Navigation, Vehicle GPS, ...   \n",
       "\n",
       "                                               title   price  \\\n",
       "0  Kelby Training DVD: Mastering Blend Modes in A...     NaN   \n",
       "1  Kelby Training DVD: Adobe Photoshop CS5 Crash ...     NaN   \n",
       "2                    Digital Organizer and Messenger    8.15   \n",
       "3  CLIKR-5 Time Warner Cable Remote Control UR5U-...   23.36   \n",
       "4  Rand McNally 528881469 7-inch Intelliroute TND...  299.99   \n",
       "\n",
       "                 salesRank                                            related  \\\n",
       "0                      NaN                                                NaN   \n",
       "1                      NaN                                                NaN   \n",
       "2  {'Electronics': 144944}  {'also_viewed': ['0545016266', 'B009ECM8QY', '...   \n",
       "3                      NaN  {'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...   \n",
       "4                      NaN  {'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...   \n",
       "\n",
       "  brand  \n",
       "0   NaN  \n",
       "1   NaN  \n",
       "2   NaN  \n",
       "3   NaN  \n",
       "4   NaN  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:58:53.418844Z",
     "start_time": "2021-02-18T01:58:53.095349Z"
    }
   },
   "outputs": [],
   "source": [
    "pickle.dump(meta_df, open('./raw_data/meta.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# remap_id\n",
    "这里再次进行处理， 基于上面的pkl文件， 处理如下：\n",
    "1. reviews_df保留'reviewerID'【用户ID】, 'asin'【产品ID】, 'unixReviewTime'【浏览时间】三列\n",
    "2. meta_df保留'asin'【产品ID】, 'categories'【种类】两列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:03.943341Z",
     "start_time": "2021-02-18T01:59:01.110907Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reviews = pd.read_pickle('./raw_data/reviews.pkl')\n",
    "reviews_df = reviews[['reviewerID', 'asin', 'unixReviewTime']]\n",
    "\n",
    "meta = pd.read_pickle('./raw_data/meta.pkl')\n",
    "meta_df = meta[['asin', 'categories']]\n",
    "\n",
    "del reviews, meta\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:06.119783Z",
     "start_time": "2021-02-18T01:59:06.078584Z"
    }
   },
   "outputs": [],
   "source": [
    "# meta_df只保留最后一个\n",
    "meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:07.318268Z",
     "start_time": "2021-02-18T01:59:07.301312Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>categories</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0132793040</td>\n",
       "      <td>Monitor Accessories</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0321732944</td>\n",
       "      <td>Monitor Accessories</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0439886341</td>\n",
       "      <td>PDAs &amp; Handhelds</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0511189877</td>\n",
       "      <td>TV Remote Controls</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0528881469</td>\n",
       "      <td>Trucking GPS</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin           categories\n",
       "0  0132793040  Monitor Accessories\n",
       "1  0321732944  Monitor Accessories\n",
       "2  0439886341     PDAs & Handhelds\n",
       "3  0511189877   TV Remote Controls\n",
       "4  0528881469         Trucking GPS"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:08.839199Z",
     "start_time": "2021-02-18T01:59:08.830224Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>unixReviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AKM1MP6P0OYPR</td>\n",
       "      <td>0132793040</td>\n",
       "      <td>1365811200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2CX7LUOHB2NDG</td>\n",
       "      <td>0321732944</td>\n",
       "      <td>1341100800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2NWSAGRHCP8N5</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>1367193600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A2WNBOD3WNDNKT</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>1374451200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A1GI0U4ZRJA8WN</td>\n",
       "      <td>0439886341</td>\n",
       "      <td>1334707200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       reviewerID        asin  unixReviewTime\n",
       "0   AKM1MP6P0OYPR  0132793040      1365811200\n",
       "1  A2CX7LUOHB2NDG  0321732944      1341100800\n",
       "2  A2NWSAGRHCP8N5  0439886341      1367193600\n",
       "3  A2WNBOD3WNDNKT  0439886341      1374451200\n",
       "4  A1GI0U4ZRJA8WN  0439886341      1334707200"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reviews_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:10.258403Z",
     "start_time": "2021-02-18T01:59:10.247433Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(59634, 2) (1000001, 3)\n"
     ]
    }
   ],
   "source": [
    "print(meta_df.shape, reviews_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:13.043313Z",
     "start_time": "2021-02-18T01:59:12.414636Z"
    }
   },
   "outputs": [],
   "source": [
    "# 上面的这个数太大了还是， 所以这里在进行采样一波， 按照用户的reviewerID采样， 采样出10万的用户数据来\n",
    "select_user_id = np.random.choice(reviews_df['reviewerID'].unique(), size=100000, replace=False)\n",
    "reviews_df = reviews_df[reviews_df['reviewerID'].isin(select_user_id)]\n",
    "meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:28.503595Z",
     "start_time": "2021-02-18T01:59:28.486641Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(27266, 2) (132639, 3)\n"
     ]
    }
   ],
   "source": [
    "print(meta_df.shape, reviews_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:31.030835Z",
     "start_time": "2021-02-18T01:59:31.021860Z"
    }
   },
   "outputs": [],
   "source": [
    "def build_map(df, col_name):\n",
    "    \"\"\"\n",
    "    制作一个映射， 键为列名， 值为序列数字\n",
    "    df: review_df / meta_df\n",
    "    col_name: 列名\n",
    "    return: 字典， 键\n",
    "    \"\"\"\n",
    "    key = sorted(df[col_name].unique().tolist())\n",
    "    m = dict(zip(key, range(len(key))))          # 这个是建立字典的常用操作， 得记住这个写法 [值， 索引]\n",
    "    df[col_name] = df[col_name].map(lambda x: m[x])        # 这地方是把原来的值变为索引了？\n",
    "    return m, key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:32.547116Z",
     "start_time": "2021-02-18T01:59:32.334383Z"
    }
   },
   "outputs": [],
   "source": [
    "# 给物品ID， 物品种类， 用户ID，建立值 -> 索引的映射\n",
    "asin_map, asin_key = build_map(meta_df, 'asin')\n",
    "cate_map, cate_key = build_map(meta_df, 'categories')\n",
    "revi_map, revi_key = build_map(reviews_df, 'reviewerID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:33.901156Z",
     "start_time": "2021-02-18T01:59:33.894176Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100000 27266 650 132639\n"
     ]
    }
   ],
   "source": [
    "user_count, item_count, cate_count, example_count = len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]\n",
    "print(user_count, item_count, cate_count, example_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:35.481928Z",
     "start_time": "2021-02-18T01:59:35.470957Z"
    }
   },
   "outputs": [],
   "source": [
    "# 按物品id排序， 并重置索引\n",
    "meta_df = meta_df.sort_values('asin').reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:36.459704Z",
     "start_time": "2021-02-18T01:59:36.342626Z"
    }
   },
   "outputs": [],
   "source": [
    "# reviews_df文件物品id进行映射， 并按照用户id，浏览时间进行排序重置索引\n",
    "reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])\n",
    "reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime']).reset_index(drop=True)\n",
    "reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:38.296440Z",
     "start_time": "2021-02-18T01:59:38.286425Z"
    }
   },
   "outputs": [],
   "source": [
    "# 各个物品对应的类别\n",
    "cate_list = np.array(meta_df['categories'], dtype='int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-02-18T01:59:39.386877Z",
     "start_time": "2021-02-18T01:59:39.350580Z"
    }
   },
   "outputs": [],
   "source": [
    "# 保存所需数据为pkl文件\n",
    "with open('./dataset/remap.pkl', 'wb') as f:\n",
    "    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump((user_count, item_count, cate_count, example_count), f, pickle.HIGHEST_PROTOCOL)\n",
    "    pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
